Now that the reads have gone through the DADA2 pipeline, we can start analyzing.

1 Load libraries and directories

library(magrittr)
library(phyloseq)
library(ggplot2)
library(fs)
library(tidyverse)
library(decontam)
library(microbiome)
library(ggrepel)
library(DT)
library(plotly)
library(ape)
library(picante)
library(DECIPHER)
library(phytools)
library(colorBlindness)
rm(list=ls()) # clear environment 

# Project-related directories: change as needed! 
dmc.dir = file.path("/hpc/group/dmc/Eva") # CHANGE ME to parent directory
git.dir = file.path(dmc.dir, "GLP_KO_Microbiome_Study") # CHANGE ME to Git Directory

map.file = file.path(git.dir, "Metadata_GLP1.csv")
meta.df = read_csv(map.file, show_col_types = FALSE)

sra.file = file.path(git.dir, "GLP1_study_SraRunTable.txt")
sra.df = read_csv(sra.file, show_col_types = FALSE)

ps.Run1.rds <- file.path(git.dir, "ps.Run1.rds")
ps.Run1 <- read_rds(ps.Run1.rds)

ps.rds <- file.path(git.dir, "ps.Run2.rds")
ps <- read_rds(ps.rds)
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3305 taxa and 280 samples ]
## sample_data() Sample Data:       [ 280 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 3305 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3305 reference sequences ]
# Sample return files from Duke Microbiome Core Facility

Run1.file = file.path("Run1_SampleReturn.csv")
Run1.df = read_csv(Run1.file, show_col_types = FALSE)

Run2.file = file.path("Run2_SampleReturn.csv")
Run2.df = read_csv(Run2.file, show_col_types = FALSE)

fecal.file = file.path(git.dir, "Run2_Fecal_submissions.csv")
fecal.df = read_csv(fecal.file, show_col_types = FALSE)

Note that we had 6 lung tissue samples that were submitted as a part of another run (“Run 1”) and another run with all the fecal and lung tissue samples (“Full Run”).

Directories for building a phylogenetic tree:

scratch.dir = file.path("/work/yk132/GLP_KO_Microbiome_Study") # CHANGE ME to project work directory 

Sys.setenv(SCRATCH_DIR = scratch.dir)
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3305 taxa and 280 samples ]
## sample_data() Sample Data:       [ 280 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 3305 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3305 reference sequences ]
ps.Run1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 179 taxa and 11 samples ]
## sample_data() Sample Data:       [ 11 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 179 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 179 reference sequences ]
subset_taxa(ps, Kingdom == "Bacteria") -> ps # remove non-bacterial taxa
subset_taxa(ps.Run1, Kingdom == "Bacteria") -> ps.Run1 # remove non-bacterial taxa

ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 280 samples ]
## sample_data() Sample Data:       [ 280 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.Run1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 179 taxa and 11 samples ]
## sample_data() Sample Data:       [ 11 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 179 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 179 reference sequences ]

And update metadata to include Run information:

sra.df %>% select(`Library Name`, Run) %>%
  dplyr::rename(Label = `Library Name`) %>%
  left_join(meta.df, by = "Label") -> meta.run

meta.run

2 Update Metadata for Run 1

The metadata tables have been merged multiple times with other data frames to encompass the host health data.

meta.run %>% filter(Sequencing_Run == "1") -> meta.Run1

meta.Run1$Label
##  [1] "JI-688"          "JI-678"          "JI-676"          "JI-670"         
##  [5] "JI-668"          "Saline-blank-3"  "PCR-neg1-eva"    "Kit-blank-2"    
##  [9] "zymo-eva"        "JI-650"          "Elution-blank-5"
Run1.df$`Sample Name`
##  [1] "JI-668"          "JI-650"          "JI-688"          "JI-676"         
##  [5] "JI-678"          "JI-670"          "Saline-blank-3"  "Kit-blank-2"    
##  [9] "Elution-blank-5" "PCR-neg1"        "zymo"

The labels are different between the metadata file and Run 1 dataframe. Let’s make them consistent; note that the metadata file reflects the actual sample names used in .fastq files, so we will use the metadata file’s labels.

Run1.df$`Sample Name`[Run1.df$`Sample Name` == "PCR-neg1"] <- "PCR-neg1-eva"
Run1.df$`Sample Name`[Run1.df$`Sample Name` == "zymo"] <- "zymo-eva"
all(Run1.df$`Sample Name` %in% meta.Run1$Label)
## [1] TRUE
Run1.df %>% dplyr::select(c("Sample Name", "QuBit ng/ul")) %>% 
  dplyr::rename(Label = "Sample Name",
                Qubit_ng_ul = "QuBit ng/ul") %>% 
  right_join(meta.Run1, by = "Label") %>% 
  column_to_rownames("Run") -> meta.Qubit.Run1
 
phyloseq::sample_data(ps.Run1) <- phyloseq::sample_data(meta.Qubit.Run1)

3 Decontam

Since we have data from two separate runs, we will run decontam separately for each phyloseq object and merge them at the end. Moreover, fecal and lung tissue samples were prepared in different ways using different kits and by different personnel in different labs. As such, we should run decontam by sample type to remove sample type-specific contaminating taxa. This may not get rid of all the contaminating taxa that come from the actual sequencing; as such, a final decontam will be run with the merged clean phyloseq objects.

This run of decontam contains all procedural blanks including PCR and diluting water blanks for fecal samples (fecal DNA was too high, so input DNA into PCR was diluted as necessary).

3.1 Run 1

Number of reads:

df <- as.data.frame(sample_data(ps.Run1)) # Put sample_data into a ggplot-friendly data.frame
df$LibrarySize <- sample_sums(ps.Run1)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
df %>% rownames_to_column("Sample") -> df
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Type, tooltip = Sample)) + geom_point() + theme_bw() -> p
ggplotly(p, tooltip = c("Sample"))

Let’s look at number of non-zero ASVs per sample:

otu_table(ps.Run1) %>% as.data.frame() -> df_otu
df_otu$ASV_Sums <- rowSums(df_otu != 0)
df_otu %>% rownames_to_column("Sample") %>% left_join(df, by = "Sample") -> df_otu
df_otu <- df_otu[order(df_otu$ASV_Sums),]
df_otu$otu_index <- seq(nrow(df_otu))
ggplot(data=df_otu, aes(x=otu_index, y=ASV_Sums, color=Type, tooltip = Sample)) + geom_point() + theme_bw() -> p2
ggplotly(p2, tooltip = c("Sample"))

The samples, in general, have a low number of ASVs.

While the number of reads for some samples aren’t too different from negative controls, it is good that samples at least seem to have a higher number of ASVs present.

I propose using decontam with both the prevalence and frequency-based contamination identification since the Qubit data after library preparation is available from DMC, though it needs to be merged with the metadata file.

I am removing contaminants that have been identified by either frequency or prevalence method. The threshold for frequency is 0.1, whereas that of prevalence method is 0.5 (i.e., reads are identified as contaminants if they are more prevalent in negative controls vs. the actual samples). For more information on decontam package, please refer to this vignette.

sample_data(ps.Run1)$is.neg <- sample_data(ps.Run1)$Type == "Neg Control"
ps.contam <- isContaminant(ps.Run1, method = "either",  neg = "is.neg", conc = "Qubit_ng_ul", threshold = c(0.1, 0.5)) # for frequency, the threshold order is frequency test then prevalence test. Using 0.5 for prevalence
table(ps.contam$contaminant)
## 
## FALSE  TRUE 
##   174     5
ps.contam %>% filter(contaminant == TRUE) %>% arrange(desc(freq))
ps.Run1.clean <- prune_taxa(!ps.contam$contaminant, ps.Run1)
ps.Run1.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 174 taxa and 11 samples ]
## sample_data() Sample Data:       [ 11 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 174 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 174 reference sequences ]

Now, I’ll graph the number of reads / ASVs per sample as before, but after decontam:

df <- as.data.frame(sample_data(ps.Run1.clean)) # Put sample_data into a ggplot-friendly data.frame
df$LibrarySize <- sample_sums(ps.Run1.clean)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
df %>% rownames_to_column("Sample") -> df
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Type, tooltip = Sample)) + geom_point() + theme_bw() -> p
ggplotly(p, tooltip = c("Sample"))
otu_table(ps.Run1.clean) %>% as.data.frame() -> df_otu
df_otu$ASV_Sums <- rowSums(df_otu != 0)
df_otu %>% rownames_to_column("Sample") %>% left_join(df, by = "Sample") -> df_otu
df_otu <- df_otu[order(df_otu$ASV_Sums),]
df_otu$otu_index <- seq(nrow(df_otu))
ggplot(data=df_otu, aes(x=otu_index, y=ASV_Sums, color=Type, tooltip = Sample)) + geom_point()  + theme_bw() -> p2
ggplotly(p2, tooltip = c("Sample"))

Since we only removed 5 taxa, they aren’t too different before and after.

3.2 Full Run

Let’s repeat the process with the full run.

df <- as.data.frame(sample_data(ps)) # Put sample_data into a ggplot-friendly data.frame
df$LibrarySize <- sample_sums(ps)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
rownames(df) -> tmp
df$Sample <- tmp
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Type, tooltip = Sample)) + geom_point() + facet_wrap(~Sample_type)+ theme_bw()  -> p
ggplotly(p, tooltip = c("Sample"))
otu_table(ps) %>% as.data.frame() -> df_otu
df_otu$ASV_Sums <- rowSums(df_otu != 0)
rownames(df_otu) -> tmp
df_otu$Sample <- tmp
df_otu %>% left_join(df, by = "Sample") -> df_otu
df_otu <- df_otu[order(df_otu$ASV_Sums),]
df_otu$otu_index <- seq(nrow(df_otu))
ggplot(data=df_otu, aes(x=otu_index, y=ASV_Sums, color=Type, tooltip = Sample)) + geom_point() + facet_wrap(~Sample_type)+ theme_bw()  -> p2
ggplotly(p2, tooltip = c("Sample"))

It is at least comforting that samples seem to have much more ASV’s than negative controls in general.

meta.run %>% dplyr::filter(Sequencing_Run == "2") %>% 
  select(-c("Concentration-(ng/uL)")) -> meta.Run2

Run2.df %>% dplyr::select(c("Sample Name", "QuBit ng/ul")) %>% 
  dplyr::rename(Label = "Sample Name", "Concentration-(ng/uL)" = "QuBit ng/ul") -> Run2.df.merge

fecal.df %>% dplyr::select(c("Sample-Index", "Concentration-(ng/uL)")) %>% 
  dplyr::rename(Label = "Sample-Index") %>% 
  rbind(Run2.df.merge) %>% 
  right_join(meta.Run2, by = "Label") %>%
  dplyr::rename(Qubit_ng_ul = "Concentration-(ng/uL)") %>%
  column_to_rownames("Run") ->  meta.Qubit.Run2

phyloseq::sample_data(ps) <- phyloseq::sample_data(meta.Qubit.Run2)

This is due to some samples not surviving the DADA2 process.

4 Decontam By Sample Type

Fecal and lung tissue samples were prepared in different ways using different kits. Furthermore, lung tissue samples were prepared by DMC, whereas fecal samples were prepared in-house. As such, we should run decontam by sample type to remove sample type-specific contaminating taxa. This may not get rid of all the contaminating taxa that come from the actual sequencing; as such, a final decontam will be run with the merged clean phyloseq objects.

ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 280 samples ]
## sample_data() Sample Data:       [ 280 samples by 40 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.Run1.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 174 taxa and 11 samples ]
## sample_data() Sample Data:       [ 11 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 174 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 174 reference sequences ]
merge_phyloseq(ps, ps.Run1.clean) -> ps.merge
ps.merge
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 291 samples ]
## sample_data() Sample Data:       [ 291 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.merge %>% subset_samples(Sample_type == "Fecal") -> ps.Fecal
ps.merge %>% subset_samples(Sample_type == "Lung Tissue") -> ps.LT
sample_data(ps.Fecal)$is.neg <- sample_data(ps.Fecal)$Type == "Neg Control"
ps.Fecal.contam <- isContaminant(ps.Fecal, method = "either",  neg = "is.neg", conc = "Qubit_ng_ul", threshold = c(0.1, 0.5)) # for frequency, the threshold order is frequency test then prevalence test. Using 0.5 for prevalence
## Warning in .is_contaminant(seqtab, conc = conc, neg = neg, method = method, :
## Removed 21 samples with zero total counts (or frequency).

## Warning in .is_contaminant(seqtab, conc = conc, neg = neg, method = method, :
## Removed 21 samples with zero total counts (or frequency).
table(ps.Fecal.contam$contaminant)
## 
## FALSE  TRUE 
##  3278    26
ps.Fecal.contam %>% filter(contaminant == TRUE) %>% arrange(desc(freq))
ps.Fecal.clean <- prune_taxa(!ps.Fecal.contam$contaminant, ps.Fecal)
ps.Fecal
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 228 samples ]
## sample_data() Sample Data:       [ 228 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.Fecal.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3278 taxa and 228 samples ]
## sample_data() Sample Data:       [ 228 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3278 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3278 reference sequences ]
sample_data(ps.LT)$is.neg <- sample_data(ps.LT)$Type == "Neg Control"
ps.LT.contam <- isContaminant(ps.LT, method = "either",  neg = "is.neg", conc = "Qubit_ng_ul", threshold = c(0.1, 0.5)) # for frequency, the threshold order is frequency test then prevalence test. Using 0.5 for prevalence
table(ps.LT.contam$contaminant)
## 
## FALSE  TRUE 
##  3271    33
ps.LT.contam %>% filter(contaminant == TRUE) %>% arrange(desc(freq))
ps.LT.clean <- prune_taxa(!ps.LT.contam$contaminant, ps.LT)
ps.LT
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.LT.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3271 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3271 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3271 reference sequences ]

And now merge the phyloseq objects again:

ps.merge
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 291 samples ]
## sample_data() Sample Data:       [ 291 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.Fecal.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3278 taxa and 228 samples ]
## sample_data() Sample Data:       [ 228 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3278 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3278 reference sequences ]
ps.LT.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3271 taxa and 63 samples ]
## sample_data() Sample Data:       [ 63 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3271 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3271 reference sequences ]
merge_phyloseq(ps.Fecal.clean, ps.LT.clean) -> ps.clean
ps.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 291 samples ]
## sample_data() Sample Data:       [ 291 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]

Graph after decontam:

df <- as.data.frame(sample_data(ps.clean)) # Put sample_data into a ggplot-friendly data.frame
df$LibrarySize <- sample_sums(ps.clean)
df <- df[order(df$LibrarySize),]
df$Index <- seq(nrow(df))
rownames(df) -> tmp
df$Sample <- tmp
ggplot(data=df, aes(x=Index, y=LibrarySize, color=Type, tooltip = Sample)) + geom_point() + facet_wrap(~Sample_type)+ theme_bw()  -> p
ggplotly(p, tooltip = c("Sample"))
otu_table(ps.clean) %>% as.data.frame() -> df_otu
df_otu$ASV_Sums <- rowSums(df_otu != 0)
rownames(df_otu) -> tmp
df_otu$Sample <- tmp
df_otu %>% left_join(df, by = "Sample") -> df_otu
df_otu <- df_otu[order(df_otu$ASV_Sums),]
df_otu$otu_index <- seq(nrow(df_otu))
ggplot(data=df_otu, aes(x=otu_index, y=ASV_Sums, color=Type, tooltip = Sample)) + geom_point() + facet_wrap(~Sample_type)+ theme_bw()  -> p2
ggplotly(p2, tooltip = c("Sample"))

And let’s remove the positive control:

ps.clean
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 291 samples ]
## sample_data() Sample Data:       [ 291 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps.clean %>% subset_samples(Type != "Pos Control") -> ps.clean.sam
ps.clean.sam
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 289 samples ]
## sample_data() Sample Data:       [ 289 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]

5 Remove samples with low number of reads

I will prune before building the phylogenetic tree; as a general rule of thumb, I prefer doing analyses on samples with > 1000 reads per sample.

sample_min_count = 1000

which(sample_sums(ps.clean.sam) < sample_min_count) 
## SRR29054908 SRR29054952 SRR29054953 SRR29054954 SRR29054955 SRR29054957 
##           8          21          22          23          24          26 
## SRR29054958 SRR29054959 SRR29054960 SRR29054961 SRR29054962 SRR29054963 
##          27          28          29          30          31          32 
## SRR29054964 SRR29054975 SRR29054976 SRR29054996 SRR29054997 SRR29055014 
##          33          36          37          57          58          75 
## SRR29055037 SRR29055038 SRR29055039 SRR29055040 SRR29055042 SRR29055044 
##          95          96          97          98         100         101 
## SRR29055045 SRR29055046 SRR29055047 SRR29055048 SRR29055049 SRR29055051 
##         102         103         104         105         106         108 
## SRR29055054 SRR29055055 SRR29055062 SRR29055064 SRR29055087 SRR29055089 
##         111         112         119         121         144         146 
## SRR29055090 SRR29055114 SRR29055118 SRR29055127 SRR29055153 SRR29055155 
##         147         171         175         184         192         194 
## SRR29055160 SRR29055161 SRR29055175 SRR29055176 SRR29055177 SRR29055178 
##         195         196         210         211         212         213 
## SRR29054972 SRR29055136 SRR29054920 SRR29054946 SRR29054949 SRR29054968 
##         257         262         280         285         286         287 
## SRR29055156 
##         289
which(sample_sums(ps.clean.sam) < sample_min_count) %>% length()
## [1] 55
meta.run %>% filter(Sample_type == "Fecal" & Type == "Neg Control") %>% select(Type, Run)

Many of these are blanks.

ps.clean.sam %>%
  prune_samples(sample_sums(.)>=sample_min_count, .) ->
  ps_filt

ps_filt
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 234 samples ]
## sample_data() Sample Data:       [ 234 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps_filt %>% subset_samples(Sample_type == "Lung Tissue" & Type == "True Sample")
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 44 samples ]
## sample_data() Sample Data:       [ 44 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]
ps_filt %>% subset_samples(Sample_type == "Fecal" & Type == "True Sample")
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 178 samples ]
## sample_data() Sample Data:       [ 178 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]

6 Build a phylogenetic tree

Now that we have removed contaminants, we can start analyses. First, let’s build a phylogenetic tree for distance-based beta diversity metrics.

6.0.1 Align ASVs

alignment <- AlignSeqs(ps_filt@refseq, anchor=NA, processors = NULL) # automatically detect & use available processors
input.alignment <- file.path(scratch.dir, "input_alignment_filt.fasta")
Sys.setenv(ALIGNED_ASV_FASTA=input.alignment)
writeXStringSet(alignment, filepath = input.alignment, format = "fasta")
set -u

raxmlHPC-PTHREADS -s $ALIGNED_ASV_FASTA -m GTRGAMMAIX -f a -p 1234 -x 2345 -N 100 -n alignment_filt -T 36 -w $SCRATCH_DIR # code from Dr. Granek
tree <- read.tree(file.path(scratch.dir, "RAxML_bipartitions.alignment_filt"))
str(tree)
## List of 5
##  $ edge       : int [1:6605, 1:2] 3305 3305 3306 3307 3307 3308 3308 3309 3309 3306 ...
##  $ edge.length: num [1:6605] 0.006643 0.000001 0.000001 0.006647 0.006453 ...
##  $ Nnode      : int 3302
##  $ node.label : chr [1:3302] "" "4" "8" "68" ...
##  $ tip.label  : chr [1:3304] "ASV1028" "ASV550" "ASV1027" "ASV1234" ...
##  - attr(*, "class")= chr "phylo"
##  - attr(*, "order")= chr "cladewise"
is.rooted(tree)
## [1] FALSE
rooted.tree <- phytools::midpoint.root(tree) # root for UniFrac
class(rooted.tree)
## [1] "phylo"
is.rooted(rooted.tree)
## [1] TRUE
ps.wtree <- merge_phyloseq(ps_filt, phy_tree(rooted.tree))
ps.rds = file.path(git.dir, paste0("ps.GLP1.rds")) 
write_rds(ps.wtree, ps.rds)
loaded.ps = read_rds(ps.rds)
print(loaded.ps)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3304 taxa and 234 samples ]
## sample_data() Sample Data:       [ 234 samples by 42 sample variables ]
## tax_table()   Taxonomy Table:    [ 3304 taxa by 6 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 3304 tips and 3303 internal nodes ]
## refseq()      DNAStringSet:      [ 3304 reference sequences ]

7 Reproducibility

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] colorBlindness_0.1.9 phytools_1.9-16      maps_3.4.1          
##  [4] DECIPHER_2.28.0      RSQLite_2.3.3        Biostrings_2.68.1   
##  [7] GenomeInfoDb_1.36.4  XVector_0.40.0       IRanges_2.34.1      
## [10] S4Vectors_0.38.2     BiocGenerics_0.46.0  picante_1.8.2       
## [13] nlme_3.1-163         vegan_2.6-4          lattice_0.22-5      
## [16] permute_0.9-7        ape_5.7-1            plotly_4.10.3       
## [19] DT_0.29              ggrepel_0.9.4        microbiome_1.22.0   
## [22] decontam_1.20.0      lubridate_1.9.3      forcats_1.0.0       
## [25] stringr_1.5.0        dplyr_1.1.3          purrr_1.0.2         
## [28] readr_2.1.4          tidyr_1.3.0          tibble_3.2.1        
## [31] tidyverse_2.0.0      fs_1.6.3             ggplot2_3.4.4       
## [34] phyloseq_1.44.0      magrittr_2.0.3      
## 
## loaded via a namespace (and not attached):
##  [1] mnormt_2.1.1            DBI_1.1.3               bitops_1.0-7           
##  [4] phangorn_2.11.1         rlang_1.1.1             ade4_1.7-22            
##  [7] compiler_4.3.1          mgcv_1.9-0              vctrs_0.6.4            
## [10] reshape2_1.4.4          combinat_0.0-8          quadprog_1.5-8         
## [13] pkgconfig_2.0.3         crayon_1.5.2            fastmap_1.1.1          
## [16] ellipsis_0.3.2          labeling_0.4.3          utf8_1.2.4             
## [19] rmarkdown_2.25          tzdb_0.4.0              bit_4.0.5              
## [22] xfun_0.40               zlibbioc_1.46.0         cachem_1.0.8           
## [25] clusterGeneration_1.3.8 jsonlite_1.8.7          biomformat_1.28.0      
## [28] blob_1.2.4              rhdf5filters_1.12.1     Rhdf5lib_1.22.1        
## [31] cluster_2.1.4           R6_2.5.1                bslib_0.5.1            
## [34] stringi_1.7.12          numDeriv_2016.8-1.1     jquerylib_0.1.4        
## [37] Rcpp_1.0.11             iterators_1.0.14        knitr_1.44             
## [40] optimParallel_1.0-2     Matrix_1.6-1.1          splines_4.3.1          
## [43] igraph_1.5.1            timechange_0.2.0        tidyselect_1.2.0       
## [46] rstudioapi_0.15.0       yaml_2.3.7              doParallel_1.0.17      
## [49] codetools_0.2-19        plyr_1.8.9              Biobase_2.60.0         
## [52] withr_2.5.1             coda_0.19-4             evaluate_0.22          
## [55] Rtsne_0.16              gridGraphics_0.5-1      survival_3.5-7         
## [58] pillar_1.9.0            foreach_1.5.2           generics_0.1.3         
## [61] vroom_1.6.4             RCurl_1.98-1.13         hms_1.1.3              
## [64] munsell_0.5.0           scales_1.2.1            glue_1.6.2             
## [67] scatterplot3d_0.3-44    lazyeval_0.2.2          tools_4.3.1            
## [70] data.table_1.14.8       cowplot_1.1.1           fastmatch_1.1-4        
## [73] rhdf5_2.44.0            grid_4.3.1              plotrix_3.8-2          
## [76] crosstalk_1.2.0         colorspace_2.1-0        GenomeInfoDbData_1.2.10
## [79] cli_3.6.1               expm_0.999-7            fansi_1.0.5            
## [82] viridisLite_0.4.2       gtable_0.3.4            sass_0.4.7             
## [85] digest_0.6.33           farver_2.1.1            htmlwidgets_1.6.2      
## [88] memoise_2.0.1           htmltools_0.5.6.1       multtest_2.56.0        
## [91] lifecycle_1.0.3         httr_1.4.7              bit64_4.0.5            
## [94] MASS_7.3-60